package mpo.serve.processer;

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

public class TencentProcesser implements PageProcessor {

	@SuppressWarnings("deprecation")
	private Site site = Site.me().setDomain("news.qq.com")
			.addStartUrl("http://news.qq.com")
			.setCharset("gb2312");

	public void process(Page page) {
		String urlRegex = "http://news.qq.com/\\w+/\\d+/\\d+.htm";
		List<String> links = page.getHtml().links().regex(urlRegex).all();
		if (!page.getRequest().getUrl().matches(urlRegex)) {
			page.addTargetRequests(links);
		}
		if (page.getUrl().toString().matches(urlRegex)) {
			page.putField("url", page.getUrl());
			page.putField("title", page.getHtml().xpath("//h1/text()"));
			page.putField("keywords",
					page.getHtml().xpath("//meta[@name=keywords]/@content")
							.toString().replace(' ', ','));
			page.putField("description",
					page.getHtml().xpath("//meta[@name=description]/@content"));
			// page.putField("content", new
			// Html(page.getHtml().xpath("//div[@id=Cnt-Main-Article-QQ]").toString()).smartContent());
			String str = page.getHtml().xpath("//div[@id=Cnt-Main-Article-QQ]")
					.toString().replaceAll("<[^>]*>", "");
			page.putField("content", str);
			// page.putField("date",
			// page.getHtml().xpath("//span[@id=pub_date]/text()"));
			// page.putField("media",
			// page.getHtml().xpath("//span[@id=media_name]/*/text()"));
			page.putField("source", "腾讯");
		}
	}

	public Site getSite() {
		return site;
	}

}
